base.py 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252
  1. from __future__ import absolute_import, division, unicode_literals
  2. from xml.dom import Node
  3. from ..constants import namespaces, voidElements, spaceCharacters
  # Names exported by ``from <module> import *``: the node-type constants
  # followed by the two walker base classes.
  __all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
             "TreeWalker", "NonRecursiveTreeWalker"]

  # Node-type tags compared against the first item of the tuple returned by
  # ``NonRecursiveTreeWalker.getNodeDetails``.  They reuse the values of the
  # corresponding ``xml.dom.Node`` constants.
  DOCUMENT = Node.DOCUMENT_NODE
  DOCTYPE = Node.DOCUMENT_TYPE_NODE
  TEXT = Node.TEXT_NODE
  ELEMENT = Node.ELEMENT_NODE
  COMMENT = Node.COMMENT_NODE
  ENTITY = Node.ENTITY_NODE

  # Tag for node types that have no dedicated constant above.
  UNKNOWN = "<#UNKNOWN#>"

  # Join the imported collection of whitespace characters into one string so
  # it can be handed directly to ``lstrip``/``rstrip`` in ``TreeWalker.text``.
  spaceCharacters = "".join(spaceCharacters)
  class TreeWalker(object):
      """Walks a tree yielding tokens

      Tokens are dicts that all have a ``type`` field specifying the type of the
      token.

      This base class only provides the token factories; subclasses supply the
      actual traversal by implementing ``__iter__``.

      """
      def __init__(self, tree):
          """Creates a TreeWalker

          :arg tree: the tree to walk

          """
          self.tree = tree

      def __iter__(self):
          """Iterates over the tokens of the tree

          :raises NotImplementedError: always; subclasses must override this

          """
          raise NotImplementedError

      def error(self, msg):
          """Generates an error token with the given message

          :arg msg: the error message

          :returns: SerializeError token

          """
          return {"type": "SerializeError", "data": msg}

      def emptyTag(self, namespace, name, attrs, hasChildren=False):
          """Generates an EmptyTag token

          This is a generator: it always yields the EmptyTag token and, when
          ``hasChildren`` is true, a SerializeError token after it.

          :arg namespace: the namespace of the token--can be ``None``

          :arg name: the name of the element

          :arg attrs: the attributes of the element as a dict

          :arg hasChildren: whether or not to yield a SerializeError because
              this tag shouldn't have children

          :returns: generator of the EmptyTag token, optionally followed by a
              SerializeError token

          """
          yield {"type": "EmptyTag", "name": name,
                 "namespace": namespace,
                 "data": attrs}
          if hasChildren:
              yield self.error("Void element has children")

      def startTag(self, namespace, name, attrs):
          """Generates a StartTag token

          :arg namespace: the namespace of the token--can be ``None``

          :arg name: the name of the element

          :arg attrs: the attributes of the element as a dict

          :returns: StartTag token

          """
          return {"type": "StartTag",
                  "name": name,
                  "namespace": namespace,
                  "data": attrs}

      def endTag(self, namespace, name):
          """Generates an EndTag token

          :arg namespace: the namespace of the token--can be ``None``

          :arg name: the name of the element

          :returns: EndTag token

          """
          return {"type": "EndTag",
                  "name": name,
                  "namespace": namespace}

      def text(self, data):
          """Generates SpaceCharacters and Characters tokens

          Depending on what's in the data, this generates one or more
          ``SpaceCharacters`` and ``Characters`` tokens: leading whitespace,
          then the remaining text with surrounding whitespace removed, then
          trailing whitespace.  Empty pieces are skipped, so ``''`` produces no
          tokens at all.

          For example, ``' abc '`` produces::

              {"type": "SpaceCharacters", "data": " "}
              {"type": "Characters", "data": "abc"}
              {"type": "SpaceCharacters", "data": " "}

          :arg data: the text data

          :returns: generator of ``SpaceCharacters`` and ``Characters`` tokens

          """
          # Peel off the leading run of whitespace first.
          middle = data.lstrip(spaceCharacters)
          left = data[:len(data) - len(middle)]
          if left:
              yield {"type": "SpaceCharacters", "data": left}

          # Then split what remains into text and the trailing whitespace run.
          # Whitespace in the interior of the text stays part of ``Characters``.
          data = middle
          middle = data.rstrip(spaceCharacters)
          right = data[len(middle):]
          if middle:
              yield {"type": "Characters", "data": middle}
          if right:
              yield {"type": "SpaceCharacters", "data": right}

      def comment(self, data):
          """Generates a Comment token

          :arg data: the comment

          :returns: Comment token

          """
          return {"type": "Comment", "data": data}

      def doctype(self, name, publicId=None, systemId=None):
          """Generates a Doctype token

          :arg name: the doctype name

          :arg publicId: the public identifier, ``None`` by default

          :arg systemId: the system identifier, ``None`` by default

          :returns: the Doctype token

          """
          return {"type": "Doctype",
                  "name": name,
                  "publicId": publicId,
                  "systemId": systemId}

      def entity(self, name):
          """Generates an Entity token

          :arg name: the entity name

          :returns: an Entity token

          """
          return {"type": "Entity", "name": name}

      def unknown(self, nodeType):
          """Handles unknown node types

          :arg nodeType: the unrecognised node type; it does not have to be a
              string

          :returns: SerializeError token naming the node type

          """
          # Format rather than concatenate: node types may be integers, and
          # ``str + int`` raises TypeError.  The one-item tuple keeps a tuple
          # argument from being unpacked by ``%``.
          return self.error("Unknown node type: %s" % (nodeType,))
  class NonRecursiveTreeWalker(TreeWalker):
      """Tree walker that traverses the tree iteratively

      Subclasses describe their tree through the four ``get*`` hooks below;
      ``__iter__`` then walks the tree in document order with an explicit
      cursor instead of recursion and yields the tokens.

      """
      def getNodeDetails(self, node):
          """Describes a node as a tuple

          The first item is the node type; ``__iter__`` compares it against the
          module-level constants and reads the remaining items as follows:

          * ``DOCTYPE``: passed positionally to ``doctype``
          * ``TEXT``: passed positionally to ``text``
          * ``ELEMENT``: ``namespace, name, attributes, hasChildren``
          * ``COMMENT``: the comment data
          * ``ENTITY``: the entity name
          * ``DOCUMENT``: nothing further is read
          * anything else: the item after the type is passed to ``unknown``

          :arg node: the node to describe

          :returns: tuple of node type followed by its details

          """
          raise NotImplementedError

      def getFirstChild(self, node):
          """Gets the first child of a node

          :arg node: the parent node

          :returns: the first child, or ``None`` if there isn't one

          """
          raise NotImplementedError

      def getNextSibling(self, node):
          """Gets the sibling following a node

          :arg node: the node whose sibling is wanted

          :returns: the next sibling, or ``None`` if there isn't one

          """
          raise NotImplementedError

      def getParentNode(self, node):
          """Gets the parent of a node

          :arg node: the node whose parent is wanted

          :returns: the parent node; a ``None`` result ends the walk

          """
          raise NotImplementedError

      def __iter__(self):
          """Yields the tokens for ``self.tree`` in document order

          :returns: generator of token dicts

          """
          currentNode = self.tree
          while currentNode is not None:
              details = self.getNodeDetails(currentNode)
              nodeType, details = details[0], details[1:]
              hasChildren = False

              if nodeType == DOCTYPE:
                  yield self.doctype(*details)

              elif nodeType == TEXT:
                  for token in self.text(*details):
                      yield token

              elif nodeType == ELEMENT:
                  namespace, name, attributes, hasChildren = details
                  if (not namespace or namespace == namespaces["html"]) and name in voidElements:
                      for token in self.emptyTag(namespace, name, attributes,
                                                 hasChildren):
                          yield token
                      # Void elements are never descended into; emptyTag has
                      # already reported any children as an error.
                      hasChildren = False
                  else:
                      yield self.startTag(namespace, name, attributes)

              elif nodeType == COMMENT:
                  yield self.comment(details[0])

              elif nodeType == ENTITY:
                  yield self.entity(details[0])

              elif nodeType == DOCUMENT:
                  # A document emits no token of its own, only its children.
                  hasChildren = True

              else:
                  yield self.unknown(details[0])

              if hasChildren:
                  firstChild = self.getFirstChild(currentNode)
              else:
                  firstChild = None

              if firstChild is not None:
                  # Descend.
                  currentNode = firstChild
              else:
                  # Nothing below this node: close it, then move to the next
                  # sibling, climbing (and closing ancestors) until one exists
                  # or the root of the walk has been closed.
                  while currentNode is not None:
                      details = self.getNodeDetails(currentNode)
                      nodeType, details = details[0], details[1:]
                      if nodeType == ELEMENT:
                          # Only namespace and name are needed for the end tag.
                          namespace, name, _attributes, _hasChildren = details
                          # Mirror of the void-element test above: void HTML
                          # elements got an EmptyTag and so get no EndTag.
                          if (namespace and namespace != namespaces["html"]) or name not in voidElements:
                              yield self.endTag(namespace, name)
                      if self.tree is currentNode:
                          # Never walk past the node the walker started from.
                          currentNode = None
                          break
                      nextSibling = self.getNextSibling(currentNode)
                      if nextSibling is not None:
                          currentNode = nextSibling
                          break
                      else:
                          currentNode = self.getParentNode(currentNode)